import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
%matplotlib inline 
plt.style.use(['bmh']) # bmh Styling used for Visulization


df = pd.read_csv("cleaned_car_buyers.csv")
print ('Data read into a pandas dataframe!')

Data read into a pandas dataframe!


df.head() #Calling first 5 rows


df.sort_values(by=['Total'], inplace=True, ascending=False)
df


df.dtypes #Checking Datatypes

Manufacturer     object
Model            object
Price           float64
Transmission      int64
Power             int64
Engine_CC         int64
Fuel             object
Male              int64
Female            int64
Unknown           int64
Total             int64
dtype: object


df.describe() #Understanding the data


df.info(verbose=False) #Information about the dataframe

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5949 entries, 5129 to 746
Columns: 11 entries, Manufacturer to Total
dtypes: float64(1), int64(7), object(3)
memory usage: 557.7+ KB


df.isnull().sum()

Manufacturer    0
Model           0
Price           0
Transmission    0
Power           0
Engine_CC       0
Fuel            0
Male            0
Female          0
Unknown         0
Total           0
dtype: int64


df.corr() #Checking Correlation between columns


df.shape # size of dataframe (rows, columns)

(5949, 11)


df1 = df[['Total','Male','Female','Unknown','Manufacturer','Model','Price']] #creating new dataframe


df1 = df1.groupby(['Model'],as_index=False).max() #Using groupby function to create required pivot table
df1


df1.sort_values(by=['Total'], inplace=True, ascending=False) #sorting values in decending order to move top ten Ownership to top
df1


df1 = df1.head(10) #Filtering first 10 rows 
df1


# Creating the Bar Graph to show Top 10 Vehicles Ownership by gender
df1.plot(x="Model", y=["Male", "Female", "Unknown"], kind="barh",title="Top 10 Vehicles Ownership by Gender",xlabel = "Car Vehicles",figsize=(10,10))

<AxesSubplot:title={'center':'Top 10 Vehicles Ownership by Gender'}, ylabel='Car Vehicles'>


df2 = df[['Manufacturer','Model','Price','Power']] #creating new dataframe
df2


sns.displot(df2['Price'], color="r")

<seaborn.axisgrid.FacetGrid at 0x20980bebee0>


# Let's declare a variable that will be equal to the 99th percentile of the 'Price' variable
q = df2['Price'].quantile(0.99)
# Then we create a new df, with the condition that all prices must be below the 99 percentile of 'Price'
df3 = df2[df2['Price']<q]
# In this way we have essentially removed the top 1% of the data about 'Price'
df3.describe(include='all')


# We can check the PDF once again to ensure that the result is still distributed in the same way overall
# however, there are much fewer outliers
sns.displot(df3['Price'],color="g")

<seaborn.axisgrid.FacetGrid at 0x20980da7550>


sns.displot(df2['Power'])

<seaborn.axisgrid.FacetGrid at 0x209806a52b0>


# Let's declare a variable that will be equal to the 99th percentile of the 'Power' variable
q = df3['Power'].quantile(0.99)
# Then we can create a new df, with the condition that all prices must be below the 99 percentile of 'Power'
df4 = df3[df3['Power']<q]
# In this way we have essentially removed the top 1% of the data about 'Power'
df4.describe(include='all')


# We check the PDF once again to ensure that the result is still distributed in the same way overall
# however, there are much fewer outliers
sns.displot(df4['Power'],color="g")

<seaborn.axisgrid.FacetGrid at 0x209807dfbb0>


#Checking correlation of Male with other features
df[df.columns[1:]].corr()['Male'][:]

Price          -0.068462
Transmission   -0.038833
Power          -0.084845
Engine_CC      -0.114367
Male            1.000000
Female          0.825118
Unknown         0.883884
Total           0.966446
Name: Male, dtype: float64


f, ax = plt.subplots(figsize=(12, 5))
c = sns.regplot(x="Male", y="Total", data=df, ax=ax)
plt.ylim(0,)
c.set_title('Total Correlation with Male', fontdict={'fontsize':18}, pad=16)

Text(0.5, 1.0, 'Total Correlation with Male')


f, ax = plt.subplots(figsize=(12, 5))
c1 = sns.regplot(x="Male", y="Transmission", data=df, ax=ax)
plt.ylim(0,)
c1.set_title('Transmission Correlation with Male', fontdict={'fontsize':18}, pad=16)

Text(0.5, 1.0, 'Transmission Correlation with Male')


f, ax = plt.subplots(figsize=(12, 5))
c2 = sns.regplot(x="Male", y="Price", data=df, ax=ax)
plt.ylim(0,)
c2.set_title('Price Correlation with Male', fontdict={'fontsize':18}, pad=16)

Text(0.5, 1.0, 'Price Correlation with Male')


plt.figure(figsize=(4, 10))
heatmap = sns.heatmap(df[df.columns[1:]].corr()[['Male']][1:].sort_values(by='Male', ascending=False), vmin=-1, vmax=1, annot=True, cmap='RdBu')
heatmap.set_title('Features and their Correlation with Male Column', fontdict={'fontsize':18}, pad=16)
plt.savefig('Male.png', dpi=300, bbox_inches='tight')

	Manufacturer	Model	Price	Transmission	Power	Engine_CC	Fuel	Male	Female	Unknown	Total
0	Renault	Clio	22.10000	5	75	1219	petrol	241287	312556	28004	581847
1	BMW	320i	47.84837	6	126	1995	petrol	408016	115843	29125	552984
2	Volkswagen	Polo	18.19250	5	60	1408	petrol	216333	299110	31701	547144
3	Peugeot	206	20.03375	4	71	1631	petrol	178698	250614	26135	455447
4	Ford	Mondeo	39.97375	1	130	1998	petrol	357452	69603	16550	443605

	Manufacturer	Model	Price	Transmission	Power	Engine_CC	Fuel	Male	Female	Unknown	Total
5129	Ford	Fiesta	18.532143	5	68	1166	petrol	730276	789633	41603	1561512
5353	Ford	Fiesta	18.532143	5	68	1166	petrol	737836	782479	39028	1559343
4889	Ford	Fiesta	18.532143	5	68	1166	petrol	710871	781033	44195	1536099
5354	Ford	Escort	16.208744	4	63	1650	petrol	989746	501907	35418	1527071
5765	Ford	Escort	16.208744	4	63	1650	petrol	1007113	489732	29926	1526771
...	...	...	...	...	...	...	...	...	...	...	...
4888	Opel	Ascona	15.118118	4	61	1603	petrol	795	185	24	1004
3214	Lada	Samara	10.915800	5	49	1372	petrol	773	212	19	1004
1124	Skoda	Favorit	7.863333	5	44	1289	petrol	685	281	38	1004
1860	Renault	21	20.353789	4	67	1996	petrol	784	178	41	1003
746	Mini	Roadster	32.658364	6	101	1424	petrol	291	628	82	1001

	Price	Transmission	Power	Engine_CC	Male	Female	Unknown	Total
count	5949.000000	5949.000000	5949.000000	5949.000000	5.949000e+03	5949.000000	5949.000000	5.949000e+03
mean	41.087399	4.854597	110.448311	1992.979660	3.312574e+04	20087.933602	2461.825181	5.567482e+04
std	51.836351	1.241327	67.076695	806.659888	8.098990e+04	60387.333760	5739.449368	1.406078e+05
min	3.332200	0.000000	15.000000	469.000000	2.910000e+02	30.000000	13.000000	1.001000e+03
25%	18.198429	4.000000	70.000000	1498.000000	2.804000e+03	881.000000	167.000000	3.958000e+03
50%	28.960200	5.000000	94.000000	1833.000000	8.180000e+03	3409.000000	561.000000	1.293900e+04
75%	43.548516	6.000000	121.000000	2247.000000	2.567300e+04	12287.000000	1964.000000	4.127100e+04
max	646.605000	8.000000	450.000000	6208.000000	1.007113e+06	789633.000000	55632.000000	1.561512e+06

	Price	Transmission	Power	Engine_CC	Male	Female	Unknown	Total
Price	1.000000	0.424442	0.857374	0.770028	-0.068462	-0.100583	-0.083540	-0.086043
Transmission	0.424442	1.000000	0.547033	0.292420	-0.038833	-0.047664	-0.023994	-0.043816
Power	0.857374	0.547033	1.000000	0.870942	-0.084845	-0.142175	-0.109509	-0.114402
Engine_CC	0.770028	0.292420	0.870942	1.000000	-0.114367	-0.182725	-0.162236	-0.150975
Male	-0.068462	-0.038833	-0.084845	-0.114367	1.000000	0.825118	0.883884	0.966446
Female	-0.100583	-0.047664	-0.142175	-0.182725	0.825118	1.000000	0.921717	0.942365
Unknown	-0.083540	-0.023994	-0.109509	-0.162236	0.883884	0.921717	1.000000	0.945789
Total	-0.086043	-0.043816	-0.114402	-0.150975	0.966446	0.942365	0.945789	1.000000

	Model	Total	Male	Female	Unknown	Manufacturer	Price
0	09-May	44398	35025	7000	2374	Saab	52.957194
1	100	40026	33128	5716	1419	Audi	37.326360
2	1007	7130	3347	3486	417	Peugeot	18.657000
3	104	4637	2384	2115	138	Peugeot	7.508333
4	105	6512	4196	2209	107	Skoda	4.626333
...	...	...	...	...	...	...	...
480	Z4	29553	19343	8462	1748	BMW	64.109625
481	ZR	48479	28381	18426	1736	MG	21.023333
482	ZT	17280	14496	2287	513	MG	42.746667
483	ZX	128499	81329	42312	5475	Citroen	16.056769
484	Zafira	1259	762	415	116	Opel	35.270000

Car Data Analysis¶

Step 1: Importing Required Libraries¶

Step2: Reading the CSV File¶

Step3: Analyzing Dataset¶

Step 4: Task 1(Creating a Visual to show Top 10 Vehicle Ownership According to Gender)¶

Step5: Task2: (Determine the Error in Price and Power Columns and Fixing Them)¶

Since there are no missing values we check for outliers using PDF graph¶

Step 5(A) : Checking Outliers in Price Column¶

Step 5(B): Dealing with price outliers¶

Step5(C): Checking Outliers in Power¶

Step5(D): Fixing Outliers¶

Step 6: Task 3(Seeing the Correlation of Male Column with Other Features)¶

	Manufacturer	Model	Price	Power
count	5883	5883	5883.000000	5883.000000
unique	51	480	NaN	NaN
top	Ford	Space	NaN	NaN
freq	329	49	NaN	NaN
mean	NaN	NaN	37.101089	106.933197
std	NaN	NaN	31.143678	58.569440
min	NaN	NaN	3.332200	15.000000
25%	NaN	NaN	18.192500	70.000000
50%	NaN	NaN	28.608205	94.000000
75%	NaN	NaN	43.068421	120.000000
max	NaN	NaN	260.287000	433.000000

	Manufacturer	Model	Price	Power
count	5824	5824	5824.000000	5824.000000
unique	50	471	NaN	NaN
top	Ford	Space	NaN	NaN
freq	329	49	NaN	NaN
mean	NaN	NaN	35.718906	104.222356
std	NaN	NaN	27.787608	52.158339
min	NaN	NaN	3.332200	15.000000
25%	NaN	NaN	18.167154	69.000000
50%	NaN	NaN	28.379000	93.000000
75%	NaN	NaN	42.341317	118.000000
max	NaN	NaN	200.198250	316.000000

	Model	Total	Male	Female	Unknown	Manufacturer	Price
213	Fiesta	1561512	737836	789633	52404	Ford	18.532143
202	Escort	1527071	1007113	501907	38200	Ford	16.208744
214	Focus	1273125	796190	421303	55632	Ford	30.619322
393	Sierra	946240	781210	150811	14367	Ford	18.295562
228	Golf	824931	471514	306459	46958	Volkswagen	31.242154
...	...	...	...	...	...	...	...
460	X-90	1147	417	712	64	Suzuki	16.641000
449	Vel	1132	932	161	52	Renault	51.641562
188	DS5	1081	882	160	39	Citroen	42.557333
258	Korando	1080	786	259	35	Ssangyong	37.904000
232	Griffith	1078	972	73	43	TVR	71.187000

	Model	Total	Male	Female	Unknown	Manufacturer	Price
213	Fiesta	1561512	737836	789633	52404	Ford	18.532143
202	Escort	1527071	1007113	501907	38200	Ford	16.208744
214	Focus	1273125	796190	421303	55632	Ford	30.619322
393	Sierra	946240	781210	150811	14367	Ford	18.295562
228	Golf	824931	471514	306459	46958	Volkswagen	31.242154
297	Mondeo	759309	594469	138468	26609	Ford	39.973750
169	Clio	693039	269970	390458	32611	Renault	22.100000
30	2000	652975	445497	193015	19517	Rover	17.182000
33	206	568218	199330	339260	30018	Peugeot	20.033750
48	320i	567097	408016	131916	29801	BMW	47.848370